last updated: July 25 2025
Manuscript Title:
Authors: Note: Microsoft copilot was used to clean up code.
Affiliations:
Description of the dataset:
Here, we seek to evaluate the effect of a diet on the microbiome as well its ability to buffer against effects induced by Candida colonization.
To accomplish this, we read in the following data: 1. phyloseq object generated by Cecilia from merged reads after taxonomic glom to the rank of genus 2. phyloseq object generated by Cecilia from merged reads
Here, we render the following figures:
Figure 1 Supplementary Figure 1 Supplementary Figure 2 Supplementary Figure 3 —
library(knitr)
opts_chunk$set(fig.width=12, fig.height=8,
echo=TRUE, warning=FALSE, message=FALSE, error = FALSE)
Load R packages
# Set seed
set.seed(78979)
# Define required packages
packages <- c(
# Core tidyverse packages
"tidyverse",
# Phylogenetics and microbiome analysis
"phyloseq",
"microbiome",
"phangorn",
"msa",
# Statistical modeling and analysis
"lmerTest",
"rstatix",
"broom",
"broom.mixed",
"genefilter",
"vegan",
"ALDEx2",
# Visualization enhancements
"ggpubr",
"ggprism",
"gridExtra",
"ggbeeswarm",
"ggrepel",
"cowplot",
"ggtext",
# Data import
"readxl"
)
# Install missing CRAN packages
cran_packages <- setdiff(packages, rownames(installed.packages()))
cran_packages <- cran_packages[!cran_packages %in% c("phyloseq", "microbiome", "phangorn", "msa", "genefilter", "vegan", "ALDEx2")]
if(length(cran_packages) > 0) install.packages(cran_packages)
# Install missing Bioconductor packages
bioc_packages <- c("phyloseq", "microbiome", "phangorn", "msa", "genefilter", "vegan", "ALDEx2")
bioc_to_install <- setdiff(bioc_packages, rownames(installed.packages()))
if(length(bioc_to_install) > 0) BiocManager::install(bioc_to_install, update = FALSE)
# Load packages
n <- length(packages) - sum(sapply(packages, require, character.only = TRUE))
# Print loading status
if (n == 0) {
print("All necessary R packages loaded properly")
} else {
print(paste0(n, " R packages did not load properly"))
}
## [1] "All necessary R packages loaded properly"
#set graphs to the graphpad prism theme
theme_set(theme_prism())
#read in the phyloseq object
ps = readRDS("~/Desktop/perez/PerezDiet_merged_Len250_EE5_5_RefSeq_phyGenus_tre.rds")
tax_df = data.frame(ps@tax_table@.Data)
#make the mapping file
map = read.csv("~/Desktop/perez/Updated_metadata.csv")
rownames(map) = map$SampleID
#dim(map)
#fix the day labels
map$Day = as.factor(as.character(map$Day))
map$Time_label <- plyr::revalue(map$Day, c(
"-8" = "T1",
"-7" = "T1",
"-1" = "T2",
"10" = "T3",
"12" = "T3",
"13" = "T3"))
#change from fatty acid diet to "HOA"
map$Diet <- plyr::revalue(map$Diet, c("FattyAcid"="HOA", "Control"="control (AIN-93G)"))
sample_data(ps) <- sample_data(map)
we will look at these plots a few ways. in the first iteration, we will look at the data with the x axis ordered by cage number and then mouse number 1:2
# Transform to relative abundance
ps_relabund <- transform_sample_counts(ps, function(x) x / sum(x))
# Identify top 15 genera by total abundance
TopNOTUs <- names(sort(taxa_sums(ps_relabund), TRUE)[1:15])
top15 <- prune_species(TopNOTUs, ps_relabund)
tax = data.frame(top15@tax_table@.Data)
top_genera = unique(tax$Genus)
# Melt to long format
df_long <- psmelt(ps_relabund)
# Collapse all other genera into "Other"
df_long <- df_long %>%
mutate(Genus = ifelse(Genus %in% top_genera, Genus, "Other"))
# Define 16 visually distinct hex colors
palette <- c(
"#1f77b4",
"#ff7f0e",
"#2ca02c",
"#d62728",
"#9467bd",
"#8c564b",
"#e377c2",
"#7f7f7f",
"#bcbd22",
"#17becf",
"#aec7e8",
"#ffbb78",
"#98df8a",
"#ff9896",
"#c5b0d5",
"#c49c94"
)
#define time point colors
time_colors <- c("#0072B2", "#E69F00", "#009E73")
#make a new factor
df_long$Plot_factor = paste0(df_long$Cage, ";", df_long$Day, ";", df_long$Mouse)
# Plot
#ggplot(df_long, aes(x = Plot_factor, y = Abundance, fill = Genus)) +
# geom_bar(stat = "identity", alpha = 0.8) +
# facet_wrap(Group ~ Diet, scales = "free_x", ncol=2) +
# scale_fill_manual(values = palette) +
# labs(
# x = "Sample (Ordered by Target Genera Abundance)",
# y = "Relative Abundance"
# ) +
# theme_minimal() +
#theme(axis.text.x = element_text(angle = 90, hjust = 1))
lets look to see if sequencing depth differs between fatty acid and control
we can see that the time point 1 samples have less reads than the other time points, though this difference is only marginally significant.
df = data.frame(Depth = sample_sums(ps), sample_data(ps))
df$My.group = ifelse(df$Group %in% c("G1", "G2"), "pre", "post")
df$Time_label = factor(df$Time_label, levels=c("T1", "T2", "T3"))
p = ggplot(df, aes(Time_label, Depth)) + geom_boxplot() + facet_wrap(~Diet) + stat_compare_means()
p
if we drop the day -8 samples than this difference goes away. there is no significant difference between sequencing depth of the pre and post samples for the diet comparisons.
df1 = subset(df, Day != "-8")
df1$Time_label = factor(df1$Time_label, levels=c("T1", "T2", "T3"))
pa = ggplot(df1, aes(Time_label, Depth)) + geom_boxplot() + facet_wrap(~Diet) + stat_compare_means()
pb = ggplot(df1, aes(My.group, Depth)) + geom_boxplot() + facet_wrap(~Diet) + stat_compare_means()
pc = ggplot(df1, aes(Diet, Depth)) + geom_boxplot() + stat_compare_means()
grid.arrange(pa, pb, pc, ncol=3)
now let’s look at this instead based on the relative abundance of taxa later identified as differentiating days and by diet
#re order based on relative abundance of lactobacillus, bacteroides and Barnesiella
target_genera <- c("Lactobacillus", "Bacteroides", "Barnesiella")
#subset on the target genera
myphy = subset_taxa(ps, Genus %in% target_genera)
#sum the abundances of the target taxa and create a rank based measure of the samples most (1) to least abundant with these taxa
df = data.frame(SampleID = sample_names(ps), TargetOrgAbundance=sample_sums(ps)) %>%
arrange(desc(TargetOrgAbundance))
df$Myorder = 1:nrow(df)
#merge the annotation of rank order for the target taxa with the tax table
df_long_annotated = merge(df, df_long)
df_long_annotated$Myorder <- as.numeric(as.character(df_long_annotated$Myorder))
# Make sure Time_label is a factor with desired order
df_long_annotated$Time_label <- factor(df_long_annotated$Time_label, levels = c("T1", "T2", "T3"))
# Make sure Myorder is a factor with unique levels in order
df_long_annotated$Myorder <- factor(df_long_annotated$Myorder, levels = unique(df_long_annotated$Myorder))
#make the plot
fig1D <- ggplot(df_long_annotated, aes(x = Myorder, y = Abundance, fill = Genus)) +
geom_bar(stat = "identity", position = "stack", alpha=0.9) +
scale_fill_manual(values = palette) +
labs(
x = "Sample (Ordered by Target Genera Abundance)",
y = "Relative Abundance"
) +
theme_minimal() +
theme(
axis.text.x = element_blank(), # hide x axis text
axis.ticks.x = element_blank(), # hide x axis ticks
legend.text = element_text(face = "italic"),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ Time_label + Diet, scales = "free_x", ncol = 2)
fig1D
#save the figures
ggsave(fig1D, file="~/Desktop/github/perez_diet/fig1D.pdf", device="pdf", width=7, height=10)
ggsave(fig1D, file="~/Desktop/github/perez_diet/fig1D.eps", device="eps", width=6, height = 10)
To calculate alpha diversity, we have to use the phyloseq object straight from dada2 since some diversity metrics use singletons in their calculation of diversity. As a result, we will read in the original phyloseq object prior to aggregation to the genus level.
There is a consistent drop in microbial diversity across all diversity metrics in the Control group over time. The Control group experienced a measurable and statistically significant decline in microbial diversity across all indices.
The FattyAcid group exhibited a universal trend toward reduced richness and diversity. However, only Simpspon and Inverse Simpson were marginally significant, suggesting that we likely will see an increase in the dominance of the community in the fatty acid group, rather than an overall change in the number of species or abundance of rare taxa, as we see with the controls.Moreover, this change is not so marked, and we may conclude that the fatty acid diet is stabilizing microbial diversity.
phy <- readRDS("~/Desktop/perez/PerezDiet_merged_Len250_EE5_5_RefSeq.rds")
#update the metadata with cage information
map = read.csv("~/Desktop/perez/Updated_metadata.csv")
rownames(map) = map$SampleID
#fix the labels
map$Time_label <- plyr::revalue(as.factor(map$Day), c(
"-8" = "T1",
"-7" = "T1",
"-1" = "T2",
"10" = "T3",
"12" = "T3",
"13" = "T3"))
#change from fatty acid diet to "HOA"
map$Diet <- plyr::revalue(map$Diet, c("FattyAcid"="HOA", "Control"="control (AIN-93G)"))
sample_data(phy) <- sample_data(map)
#estimate species richness
df = data.frame(estimate_richness(phy, measures=c("Observed", "Chao1", "Shannon", "Simpson")), sample_data(phy)) %>%
reshape2::melt(., c("se.chao1", colnames(map)))
# Generate all pairwise combinations
time_points = c("T1", "T2", "T3")
mycomparisons <- combn(time_points, 2, simplify = FALSE)
#force the ordering of the x axis
ordering = c("T1", "T2", "T3")
df$Time_label = factor(df$Time_label, levels=ordering)
# Define pairwise time comparisons
time_points <- list(c("T1", "T2"), c("T1", "T3"), c("T2", "T3"))
# ----- 1. Pairwise Wilcoxon tests: Within each diet -----
df2 <- df %>%
dplyr::rename(diversity_measure = variable)
within_diet_pvals <- df2 %>%
group_by(diversity_measure, Diet) %>%
pairwise_wilcox_test(
value ~ Time_label,
p.adjust.method = "BH",
comparisons = time_points
) %>%
mutate(comparison_type = "Within Diet",
group = Diet) %>%
add_y_position()
# ----- 2. Wilcoxon tests: Between diets at each time point -----
between_diet_pvals <- df2 %>%
group_by(diversity_measure, Time_label) %>%
wilcox_test(value ~ Diet) %>% # no p.adjust.method here
ungroup() %>%
group_by(diversity_measure) %>%
mutate(p.adj = p.adjust(p, method = "BH")) %>%
add_significance() %>%
mutate(comparison_type = "Between Diets",
group = Time_label) %>%
# Specify the data and formula here:
add_y_position(data = df2, formula = value ~ Diet) %>%
dplyr::select(c("Time_label", "diversity_measure", "group1", "group2", "p", "p.adj", "p.adj.signif"))
write.csv(between_diet_pvals, file="~/Desktop/github/perez_diet/between_diet_same_timepoint_pvalues.csv")
# ----- 4. Plot -----
within_diet_pvals2 <- within_diet_pvals %>%
mutate(y.position = case_when(
diversity_measure == "Simpson" ~ seq(1.01, 1.001, length.out = n()),
diversity_measure == "Shannon" ~ seq(6.9, 7.5, length.out = n()),
diversity_measure == "Chao1" ~ seq(1600, 2300, length.out = n()),
diversity_measure == "Observed"~ seq(1400, 2100, length.out = n()),
TRUE ~ y.position
))
#define a function to generate the pvalues for each of the diversity metrics independently so the p values aren't overplotted
plot_diversity_metric <- function(data, pval_data, diversity_metric, time_colors) {
# Filter the data and p-values for the selected diversity metric
df_sub <- data %>% filter(diversity_measure == diversity_metric)
pval_sub <- pval_data %>% filter(diversity_measure == diversity_metric)
# Custom y.position adjustment depending on metric
pval_sub <- pval_sub %>%
group_by(Diet) %>%
mutate(
# Stagger brackets by adding 0.5 per row index
y.position = case_when(
diversity_metric == "Shannon" ~ y.position,
diversity_metric == "Simpson" ~ seq(0.9999, by = 0.0001, length.out = n()),
diversity_metric == "Chao1" ~ seq(max(df_sub$value) * 1.05, by = 50, length.out = n()),
diversity_metric == "Observed" ~ seq(max(df_sub$value) * 1.05, by = 100, length.out = n()),
TRUE ~ y.position
)
) %>%
ungroup()
# Build the plot
ggplot(df_sub, aes(x = Time_label, y = value, fill = Time_label)) +
geom_boxplot(outlier.shape = NA, alpha = 0.8) +
geom_beeswarm(dodge.width = 0.8, cex = 1) +
stat_pvalue_manual(
pval_sub,
label = "p.adj.signif",
tip.length = 0.01,
bracket.size = 0.4,
size = 3
) +
facet_grid(rows = vars(diversity_measure), cols = vars(Diet), scales = "free") +
scale_fill_manual(values = time_colors) +
theme_minimal(base_family = "Helvetica")+
labs(
x = "",
y = diversity_metric
) +
theme(
legend.position = "none",
plot.title = element_text(face = "italic", hjust = 0.5),
strip.text.y = element_blank() # 👈 Hides the facet strip label on the right
)
}
SFig1A = plot_diversity_metric(df2, within_diet_pvals2, "Observed", time_colors)
SFig1B = plot_diversity_metric(df2, within_diet_pvals2, "Simpson", time_colors)
SFig1C = plot_diversity_metric(df2, within_diet_pvals2, "Chao1", time_colors)
SFig1 = cowplot::plot_grid(SFig1A, SFig1B, SFig1C, ncol=1, labels=c("A", "B", "C"))
SFig1
ggsave(SFig1, file="~/Desktop/github/perez_diet/SFig1.pdf", device="pdf", width = 6, height = 11)
Let’s look at Shannon Diversity
fig1B = plot_diversity_metric(df2, within_diet_pvals2, "Shannon", time_colors)
fig1B
ggsave(fig1B, file="~/Desktop/github/perez_diet/fig1B.pdf", device="pdf", width = 6, height = 6)
ggsave(fig1B, file="~/Desktop/github/perez_diet/fig1B.eps", device="eps", width = 6, height = 6)
To further refine the dataset, we applied both prevalence and abundance thresholds, removing taxa that were present in fewer than 2 samples and had fewer than 100 total reads. This filtering step reduced the number of genera from 182 to just 46, allowing us to focus on the more consistently represented and biologically relevant taxa.
We do this filtering after calculating alpha diversity since some diversity metrics rely on the presence of singletons to estimate the “unseen” diversity.
#let's filter out taxa that aren't present in at least 2 samples at 100 reads
flist <- filterfun(kOverA(2, 100))
phy_filt <- filter_taxa(ps, flist, TRUE) #down to 46 genera if 100 reads required;
phy_filt
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 46 taxa and 72 samples ]
## sample_data() Sample Data: [ 72 samples by 12 sample variables ]
## tax_table() Taxonomy Table: [ 46 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 46 tips and 45 internal nodes ]
#transform clr
clr_phy = transform_sample_counts(phy_filt, function(x) compositions::clr(x))
clr_mat = data.frame(as.matrix(otu_table(clr_phy)))
clr_mat$SampleID = rownames(clr_mat)
We begin by exploring a PCoA ordination based on Bray-Curtis dissimilarity to assess overall community structure.
Panel A shows little separation between the Fatty Acid and Control diet groups, suggesting that diet alone does not strongly structure the microbial communities along the first two principal coordinates.
In Panel B, however, we observe a tendency for pre-diet samples to cluster toward positive values on Axis 3, while post-treatment samples shift toward negative values, indicating a temporal effect.
Panel C confirms that the difference in Axis 3 scores between pre- and post-treatment samples is statistically significant. Additionally, the post-treatment group exhibits greater dispersion, which may reflect increased heterogeneity or simply a larger number of samples.
When we look at between group differences for G1 to G2 we see marginal significance when looking at Axis 3 for G1 vs. G2 but no differences for G3 vs G4
In panel E, when stratifying Axis 1 scores by both diet and time, we see that we can detect a significant difference between time point 2 and 3 for the control but not the fatty acid group.
# Ordination and data prep
ord <- ordinate(phy_filt, method = "PCoA", distance = "bray")
df <- data.frame(sample_data(phy_filt), ord$vectors)
df$Mygroup <- ifelse(df$Group %in% c("G1", "G2"), "pre", "post")
df$Time_label <- factor(df$Time_label, levels = c("T1", "T2", "T3"))
# Eigenvalues and proportion explained
prop_explained <- round(100 * ord$values$Eigenvalues / sum(ord$values$Eigenvalues), 2)
# Color palette
mycolors <- c(
G1 = "#D6CDEA",
G2 = "#B39DDB",
G3 = "#C8E6C9",
G4 = "#81C784",
G5 = "#FFD1DC",
G6 = "#F48FB1"
)
# Helper function for axis labels
axis_label <- function(axis_num) {
paste0("Axis ", axis_num, ": ", prop_explained[axis_num], " %")
}
# Base plot layers for boxplots with stats & points
base_boxplot <- function(data, x, y, color_group, facet_var = NULL, y_label = NULL, hide_legend = FALSE) {
p <- ggplot(data, aes_string(x = x, y = y, color = color_group)) +
geom_boxplot() +
geom_point(position = position_jitter(width = 0.1)) +
stat_compare_means()
if (!is.null(facet_var)) {
p <- p + facet_wrap(as.formula(paste("~", facet_var)))
}
if (!is.null(y_label)) p <- p + ylab(y_label)
if (hide_legend) p <- p + theme(legend.position = "none")
p
}
########## Scatter plots
#Axis 1 vs 2
p1 <- ggplot(df, aes(Axis.1, Axis.2, color = Time_label)) +
geom_point() +
xlab(axis_label(1)) + ylab(axis_label(2)) +
scale_color_manual(values=time_colors)
#Axis 1 vs 3
p2 <- ggplot(df, aes(Axis.1, Axis.3, color = Time_label, shape = Diet)) +
geom_point() +
xlab(axis_label(1)) + ylab(axis_label(3))+
scale_color_manual(values=time_colors)
# Boxplots with faceting and stats
df$Mygroup_plotting = plyr::revalue(df$Mygroup, c("post"="Post", "pre"="Pre"))
df$Mygroup_plotting = factor(df$Mygroup_plotting, levels=c("Pre", "Post"))
p3 <- base_boxplot(df, "Mygroup_plotting", "Axis.3", "Mygroup_plotting",
facet_var = "Diet", y_label = axis_label(3), hide_legend=TRUE) +
xlab("")
#get rid of the pre-time period and see if there's significance -- there is on axis 3
#not shown here but no distinction on axis 1 or 2
p4 <- base_boxplot(subset(df, Mygroup_plotting != "Pre"), "Time_label", "Axis.3", "Time_label",
facet_var = "Diet", y_label = axis_label(3), hide_legend = TRUE) +
scale_color_manual(values=c("#84948E","#A68B6B"))
# Pairwise comparisons subset plots
#G1 vs G2 is marginally significant
g1g2a3 <- df %>%
filter(Group %in% c("G1", "G2")) %>%
base_boxplot("Group", "Axis.3", "Group", y_label = axis_label(3))
g3g4a1 <- df %>%
filter(Group %in% c("G3", "G4")) %>%
base_boxplot("Group", "Axis.4", "Group", y_label = axis_label(4))
# Combine plots
plot_grid(
p1, p2, p3, p4,
labels = c("A", "B", "C", "D"),
ncol = 2,
align = "hv"
)
To further explore community structure, we applied UniFrac, a distance metric that incorporates both taxonomic abundance and phylogenetic relationships among taxa. When we look at the ordination on axis 1 and 2, we see that there does seem to be some segregation of pre-post samples, but no distinct clustering.
Using PERMANOVA (adonis), if we do not consider cage effects, we found that diet was not a significant factor, but there was a significant difference between pre- and post-treatment samples, as well as an interaction between diet and time, indicating a temporal shift in microbial composition that is dependent on the diet.
when we do permutations considering cage effects, we see that all three variables are significant. This tells us that samples from the same cage are more similar to each other (are not independent). Controlling for cage reduces noise and clarifies that diet really matters. This tells us diet influences how the community changes over time, but this effect only becomes clear when controlling for cage effects.
Time F=4.1, Diet = 0.9, Timexdiet interaction is 2.8
# Define Main.Factor based on Group
sample_data(phy_filt)$Main.Factor <- ifelse(
sample_data(phy_filt)$Group %in% c("G1", "G2"),
"pre",
"post"
)
# Convert Day to factor
sample_data(phy_filt)$Day <- as.factor(sample_data(phy_filt)$Day)
# Extract metadata as data frame
metadata <- data.frame(sample_data(phy_filt))
# Calculate weighted, normalized UniFrac distance
unifrac_dist <- UniFrac(phy_filt, weighted = TRUE, normalized = TRUE, fast = TRUE)
# Perform PCoA ordination based on UniFrac distance
ord <- ordinate(phy_filt, method = "PCoA", distance = unifrac_dist)
# Create combined data frame for plotting
df <- data.frame(
ord$vectors,
metadata,
Depth = sample_sums(phy_filt)
)
# Convert Time_label to numeric for plotting size or ordering if needed
df$Time_label <- factor(df$Time_label, levels=c("T1", "T2", "T3"))
# Plot PCoA with points sized by sequencing depth, colored by Time_label, shaped by Diet
#use a defined color scheme
my_colors <- c("#0072B2", "#E69F00", "#009E73")
#make the figure
Fig1C <- ggplot(df, aes(x = Axis.1, y = Axis.2, shape = Diet, color = Time_label, size = Depth)) +
geom_point(alpha = 0.8) +
facet_wrap(~ Diet) +
xlab(axis_label(1)) + ylab(axis_label(2)) +
scale_color_manual(values = my_colors) +
guides(
shape = guide_legend(order = 1),
color = guide_legend(order = 2),
size = guide_legend(order = 3)
) +
theme(text = element_text(family = "Helvetica"))
print(Fig1C)
ggsave(Fig1C, file="~/Desktop/github/perez_diet/Fig1C.pdf", device="pdf", width = 8, height = 6)
let’s use a permutational analysis of variance to test for significance
# PERMANOVA to test effects of Time_label and Diet on UniFrac distances
adonis_result <- adonis2(
unifrac_dist ~ Time_label * Diet,
data = metadata,
permutations = 999, by="term",
strata = as.factor(metadata$Cage)
)
print(adonis_result)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = unifrac_dist ~ Time_label * Diet, data = metadata, permutations = 999, by = "term", strata = as.factor(metadata$Cage))
## Df SumOfSqs R2 F Pr(>F)
## Time_label 2 0.19571 0.12231 5.0566 0.002 **
## Diet 1 0.01774 0.01109 0.9169 0.003 **
## Time_label:Diet 2 0.10948 0.06842 2.8287 0.041 *
## Residual 66 1.27720 0.79819
## Total 71 1.60013 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We also assessed beta dispersion to test for differences in within-group variability, testing within groups (pre, post) as well as by diet and by Time_label. The results were not significant, suggesting that the observed differences in community composition are not due to differences in variability, but rather reflect true compositional shifts. The post samples may have greater dispersion because of the greater number of samples in that group
bd <- betadisper(unifrac_dist, metadata$Diet)
anova(bd)
## Analysis of Variance Table
##
## Response: Distances
## Df Sum Sq Mean Sq F value Pr(>F)
## Groups 1 0.002323 0.0023232 0.5262 0.4706
## Residuals 70 0.309047 0.0044150
plot(bd)
what happens when we drop the pre samples? when we drop the pre samples, then time is not significant. diet is significant, and the time x diet interaction is significant.
myphy = subset_samples(phy_filt, Main.Factor != "pre") %>%
prune_taxa(taxa_sums(.) > 0, .)
# Convert Day to factor
sample_data(myphy)$Day <- as.factor(sample_data(myphy)$Day)
# Extract metadata as data frame
metadata_post <- data.frame(sample_data(myphy))
# Calculate weighted, normalized UniFrac distance
unifrac_dist_post <- UniFrac(myphy, weighted = TRUE, normalized = TRUE, fast = TRUE)
# Perform PCoA ordination based on UniFrac distance
ord_post <- ordinate(myphy, method = "PCoA", distance = unifrac_dist_post)
# Create combined data frame for plotting
df_post <- data.frame(
ord_post$vectors,
metadata_post,
Depth = sample_sums(myphy)
)
# Convert Time_label to numeric for plotting size or ordering if needed
df_post$Time_numeric <- as.numeric(as.factor(df_post$Time_label))
# Plot PCoA with points sized by sequencing depth, colored by Time_label, shaped by Diet
p <- ggplot(df_post, aes(x = Axis.1, y = Axis.2, color = Time_label, shape = Diet)) +
geom_point(size=3) +
facet_wrap(~ Diet)+
guides(
shape = guide_legend(order = 1), # Diet first
color = guide_legend(order = 2) # Time second
)
print(p)
# PERMANOVA to test effects of Time_label and Diet on UniFrac distances
adonis_result_post <- adonis2(
unifrac_dist_post ~ Time_label * Diet,
data = metadata_post,
permutations = 999, by="term",
strata = as.factor(metadata_post$Cage)
)
print(adonis_result_post)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = unifrac_dist_post ~ Time_label * Diet, data = metadata_post, permutations = 999, by = "term", strata = as.factor(metadata_post$Cage))
## Df SumOfSqs R2 F Pr(>F)
## Time_label 1 0.03019 0.02692 1.3343 0.241
## Diet 1 0.01087 0.00970 0.4806 0.067 .
## Time_label:Diet 1 0.08487 0.07567 3.7506 0.049 *
## Residual 44 0.99562 0.88772
## Total 47 1.12155 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Let’s examine how pre- (baseline) and post-treatment samples segregate by diet along Axis 1 and Axis 2. We return to the complete dataset. In the Control group, we observe a clear separation between pre- and post-treatment samples along Axis 1, suggesting a temporal shift in community composition. In contrast, for the Fatty Acid group, this separation is more apparent along Axis 2, indicating that the dietary effect may manifest along a different compositional gradient. This is also detected for the control group on axis 2.
# Combine PCoA ordination vectors with sample metadata
UniFrac.df <- data.frame(ord$vectors, sample_data(phy_filt))
#get the eigenvalues
eig = 100*(ord$values$Eigenvalues/sum(ord$values$Eigenvalues))
# Set Time_label as an ordered factor
UniFrac.df$Time_label <- factor(UniFrac.df$Time_label, levels = c("T1", "T2", "T3"))
# Boxplot of Axis.1 by Time_label, filled by Main.Factor, faceted by Diet
p1 <- ggplot(UniFrac.df, aes(x = Time_label, y = Axis.1, fill = Main.Factor)) +
geom_boxplot() +
facet_wrap(~ Diet) +
stat_compare_means() +
theme(legend.position = "none") +
ylab(paste0("Axis 1: ", round(eig[1], 2), "%")) + xlab("")
# Boxplot of Axis.2 by Time_label, filled by Main.Factor, faceted by Diet
p2 <- ggplot(UniFrac.df, aes(x = Time_label, y = Axis.2, fill = Main.Factor)) +
geom_boxplot() +
facet_wrap(~ Diet) +
stat_compare_means() +
theme(legend.position = "none") +
ylab(paste0("Axis 2: ", round(eig[2], 2), "%")) + xlab("")
# Arrange plots vertically
gridExtra::grid.arrange(p1, p2, ncol = 1)
To identify which taxa may be driving the patterns observed in the ordination, we examined correlations between species abundances and the ordination axes. Specifically, we applied a centered log-ratio (CLR) transformation to the abundance data to account for compositionality, and then performed linear regressions of each taxon’s CLR-transformed abundance against Axis 1 and Axis 2 scores independently.
To control for multiple testing, we applied a false discovery rate (FDR) correction with a significance threshold of 0.01. This identified 58 taxa that are significantly associated with either axis and may underlie the observed shifts in community structure.
# Prepare ordination data frame with SampleID and first two axes
ordination_df <- as.data.frame(ord$vectors) %>%
rownames_to_column("SampleID") %>%
dplyr::select(SampleID, Axis.1, Axis.2)
# Join CLR-transformed taxa data with ordination coordinates
clr_ord_df <- left_join(ordination_df, clr_mat, by = "SampleID")
# Identify taxa columns by excluding ordination columns
taxa_cols <- setdiff(colnames(clr_ord_df), c("SampleID", "Axis.1", "Axis.2"))
# Function to run linear regression of each taxon against a specified axis
run_taxon_regression <- function(axis, df, taxa_cols) {
map_dfr(taxa_cols, function(taxon) {
model <- lm(as.formula(paste(axis, "~", taxon)), data = df)
tidy(model) %>%
filter(term != "(Intercept)") %>%
mutate(Axis = axis, Taxon = taxon)
})
}
# Run regressions for Axis.1 and Axis.2 and adjust p-values for multiple testing
axis1_results <- run_taxon_regression("Axis.1", clr_ord_df, taxa_cols) %>%
mutate(p.adj = p.adjust(p.value, method = "fdr"))
axis2_results <- run_taxon_regression("Axis.2", clr_ord_df, taxa_cols) %>%
mutate(p.adj = p.adjust(p.value, method = "fdr"))
# Combine results from both axes
results_all <- bind_rows(axis1_results, axis2_results)
# Prepare taxonomy data frame and join with regression results
tax_df <- data.frame(phy_filt@tax_table@.Data, Taxon = rownames(tax_table(phy_filt)))
results_annotated <- left_join(results_all, tax_df, by = "Taxon")
# Prepare volcano plot data from regression results
volcano_data <- results_annotated %>%
mutate(
neg_log10_padj = -log10(p.adj),
Significant = p.adj < 0.05
)
# Basic volcano plot
ggplot(volcano_data, aes(x = estimate, y = neg_log10_padj, label=Genus)) +
geom_point(aes(color = Significant), alpha = 0.7) + geom_text_repel()+
scale_color_manual(values = c("grey60", "red")) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
labs(
x = "Effect Size (Estimate)",
y = "-log10(Adjusted P-value)",
color = "Significant"
) +
theme_minimal() +
theme(
legend.position = "top"
)
#we see that there are 13 taxa that are significantly correlated with axis 1 or 2
foo = subset(results_all, p.adj < 0.05)
length(foo$Taxon)
## [1] 13
let’s filter to get significant genera and then test for explicit differences across time. this identifies 6 of the 13 taxa correlated with axis 1 and 2 as significantly variable across time
Lactobacillus
HOA diet: No significant changes over time (all p.adj > 0.5).
Control diet: Significant changes between T1–T2 (p.adj = 0.003) and T2–T3 (p.adj = 0.018). T1–T3 not significant.
Clostridium.XlVa
HOA diet: No significant changes (lowest p.adj = 0.085; marginal).
Control diet: Significant changes at T1–T2 (p.adj = 0.036) and T1–T3 (p.adj = 0.036). T2–T3 not significant.
Barnesiella
HOA diet: Significant changes between T1–T2 (p.adj = 0.022) and T1–T3 (p.adj = 0.003). T2–T3 not significant.
Control diet: Significant changes at T1–T2 (p.adj = 0.0008), T1–T3 (p.adj = 0.017), and T2–T3 (p.adj = 0.017).
Mucispirillum
HOA diet: No significant changes.
Control diet: Significant change between T1–T2 (p.adj = 0.03). Others not significant.
Prevotella
HOA diet: Significant changes at T1–T2 (p.adj = 0.002) and T1–T3 (p.adj = 0.0008). T2–T3 not significant.
Control diet: Significant changes at T1–T2 (p.adj = 0.0003) and T1–T3 (p.adj = 0.0006). T2–T3 not significant.
Bacteroides
HOA diet: Significant changes between T1–T2 (p.adj = 0.025) and T1–T3 (p.adj = 0.043). T2–T3 not significant.
Control diet: Significant differences between T1–T2 (p.adj = 0.015) and T1–T3 (p.adj = 0.001). T2–T3 not significant.
Turicibacter
HOA diet: No significant changes (all p.adj > 0.8).
Control diet: Significant decreases between T1–T3 (p.adj = 0.003) and T2–T3 (p.adj = 0.022).
Parabacteroides
No significant changes in either diet (all p.adj > 0.2).
Bifidobacterium
No significant changes in either diet (all p.adj > 0.1).
Clostridium.IV
No significant changes in either diet (all p.adj > 0.7).
Clostridium.XVIII
No significant changes in either diet (all p.adj > 0.6).
mycomparisons <- combn(c("T1", "T2", "T3"), 2, simplify = FALSE)
mycolorpalette =c(
"Lactobacillus" = "#aec7e8",
"Bacteroides" = "#2ca02c",
"Barnesiella" = "#d62728",
"Turicibacter" = "#c49c94",
"Prevotella" = "#e377c2",
"Clostridium XlVa" = "#ff9896",
"Mucispirillum"="gray"
)
# Create taxonomy data frame with Taxon as rownames
tax_df <- data.frame(phy_filt@tax_table@.Data, Taxon = rownames(tax_table(phy_filt)))
# Filter significant taxa and get distinct Taxon-Genus pairs
sig_taxa <- results_annotated %>%
filter(p.adj < 0.05) %>%
distinct(Taxon, Genus)
# Ensure SampleID column exists in metadata
metadata$SampleID <- rownames(metadata)
# Convert CLR matrix to long format, filter to significant taxa, join taxonomy and metadata
clr_long <- clr_mat %>%
pivot_longer(
cols = -SampleID,
names_to = "Taxon",
values_to = "CLR"
) %>%
filter(Taxon %in% sig_taxa$Taxon) %>%
left_join(sig_taxa, by = "Taxon") %>%
left_join(metadata, by = "SampleID")
####plot pairwise comparisons
# ----- Step 1: Define significant genera -----
keep <- unique(sig_taxa$Genus)
# ----- Step 2: Normalize to relative abundance -----
psra <- transform_sample_counts(ps, function(x) x / sum(x))
# ----- Step 3: Extract taxonomy and rename taxa to Genus -----
tax <- as.data.frame(tax_table(psra))
taxa_names(psra) <- tax$Genus
# ----- Step 4: Subset phyloseq to only the significant genera -----
psra_keep <- subset_taxa(psra, Genus %in% keep)
# ----- Step 5: Melt the phyloseq object to a data frame -----
df <- data.frame(otu_table(psra_keep), sample_data(psra_keep)) %>%
reshape2::melt(., colnames(sample_data(psra_keep)))
# ----- Step 6: Clean up -----
df$Time_label <- factor(df$Time_label, levels = c("T1", "T2", "T3"))
df <- df %>% dplyr::rename(Genus = variable)
# ----- Step 7: Wilcoxon tests with multiple comparisons correction -----
pval_df <- df %>%
group_by(Genus, Diet) %>%
pairwise_wilcox_test(
value ~ Time_label,
p.adjust.method = "BH",
comparisons = mycomparisons
) %>%
add_significance("p.adj") %>%
add_y_position()
# ----- Step 8: Plot -----
plot_genus_pairwise <- function(genus_name, df, mycomparisons) {
# Filter data to the specified genus
df_sub <- df %>% filter(Genus == genus_name)
# Check if there is data for this genus
if (nrow(df_sub) == 0) {
stop(paste("No data found for genus:", genus_name))
}
# Perform pairwise Wilcoxon tests for this genus grouped by Diet
pval_df <- df_sub %>%
group_by(Diet) %>%
pairwise_wilcox_test(
value ~ Time_label,
p.adjust.method = "BH",
comparisons = mycomparisons
) %>%
add_significance("p.adj") %>%
add_y_position()
# Create the plot
p <- ggplot(df_sub, aes(x = Time_label, y = value)) +
geom_boxplot(aes(fill = Time_label), outlier.shape = NA) +
geom_point(position = position_jitter(width = 0.2), alpha = 0.5, size = 1.5) +
facet_wrap(~Diet, scales = "free_y") +
stat_pvalue_manual(
pval_df,
label = "p.adj.signif",
tip.length = 0.01,
size = 4
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none") +
scale_fill_brewer(palette = "Set2") +
labs(
title = paste(genus_name),
y = "Relative Abundance",
x = "Time Label"
)
return(p)
}
mygenera = sig_taxa$Genus
p1 = plot_genus_pairwise(mygenera[1], df, mycomparisons) # Lactobacillus
p2 = plot_genus_pairwise("Clostridium.XlVa", df, mycomparisons)
p3 = plot_genus_pairwise(mygenera[3], df, mycomparisons) # Barnesiella
p4 = plot_genus_pairwise(mygenera[4], df, mycomparisons)
p5 = plot_genus_pairwise(mygenera[5], df, mycomparisons) # Prevotella
p6 = plot_genus_pairwise(mygenera[6], df, mycomparisons)
p7 = plot_genus_pairwise(mygenera[7], df, mycomparisons)
p8 = plot_genus_pairwise(mygenera[8], df, mycomparisons)
p9 = plot_genus_pairwise(mygenera[9], df, mycomparisons)
p10 = plot_genus_pairwise("Clostridium.IV", df, mycomparisons)
p11 = plot_genus_pairwise(mygenera[11], df, mycomparisons)
p12 = plot_genus_pairwise("Clostridium.XVIII", df, mycomparisons)
myplot = cowplot::plot_grid(p1, p2, p3, p5, p7, p8, p11, ncol=2)
myplot
let’s look at whether we see a difference between T1 vs T1, T2 vs T2, T3 vs T3 by diet
4 genera showed significant differences in relative abundance between the control (AIN-93G) and HOA diets. At T2, Mucispirillum was significantly more abundant in the control group compared to the HOA group (p.adj = 0.012), and Romboutsia also showed a significant difference favoring the control diet (p.adj = 0.039). However, it’s important to note that these taxa are present at less than. 0.1 relative abundance, so this could just be noise. At T1, Barnesiella exhibited higher abundance in the HOA group (p.adj = 0.033), indicating an early diet effect, which may have influenced the trajectory of outcomes. Finally, at T3, Turicibacter was significantly more abundant in the HOA group than the control (p.adj = 0.033).
# Ensure Time_label is a factor
df$Time_label <- factor(df$Time_label, levels = c("T1", "T2", "T3"))
# Step 1: Run pairwise Wilcoxon tests by Genus and Time
diet_diff_results <- df %>%
group_by(Genus, Time_label) %>%
pairwise_wilcox_test(
value ~ Diet,
p.adjust.method = "BH"
) %>%
add_significance("p.adj")
# Step 2: Filter for significant genera
significant_genera <- diet_diff_results %>%
filter(p.adj < 0.05) %>%
pull(Genus) %>%
unique()
# Step 3: Merge p-values with df for plotting
df_plot <- df %>%
left_join(diet_diff_results, by = c("Genus", "Time_label"))
# Step 4: Define plotting function
plot_genus_diet_comparison <- function(genus_name, df_plot) {
df_sub <- df_plot %>% filter(Genus == genus_name)
if (nrow(df_sub) == 0) {
stop(paste("No data found for genus:", genus_name))
}
# p-value annotations per Time_label
pval_df <- df_sub %>%
dplyr::select(Time_label, p.adj, p.adj.signif) %>%
distinct() %>%
drop_na(p.adj) %>%
mutate(
group1 = "HOA",
group2 = "control (AIN-93G)",
y.position = max(df_sub$value, na.rm = TRUE) * 1.1
)
ggplot(df_sub, aes(x = Diet, y = value)) +
geom_boxplot(aes(fill = Diet), outlier.shape = NA) +
geom_point(position = position_jitter(width = 0.2), alpha = 0.5, size = 1.5) +
facet_wrap(~Time_label, scales = "free_y") +
stat_pvalue_manual(
pval_df,
label = "p.adj.signif",
tip.length = 0.01,
size = 4
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none") +
scale_fill_manual(values = c("HOA" = "#1f77b4", "control (AIN-93G)" = "#ff7f0e")) +
labs(
title = paste0("*", genus_name, "*"),
y = "Relative Abundance",
x = "Diet"
) +
theme(plot.title = element_markdown(size = 14))
}
# Step 5: Generate plots for all significant genera
plot_list <- lapply(significant_genera, function(genus) {
plot_genus_diet_comparison(genus, df_plot)
})
# Step 6: Combine all plots
cowplot::plot_grid(plotlist = plot_list, ncol = 2)
To better understand whether the interaction between time and diet is primarily driven by differences between pre- and post-treatment samples, we repeated the PERMANOVA analysis after excluding the pre-treatment (T1) samples and controlling for cage effects. In this post-treatment subset (T2 and T3), the time × diet interaction remained marginally significant, while the main effect of time was not, and the effect of diet was marginally significant. This suggests that microbial communities shift differently from T2 to T3 depending on the diet.
We also observed a horseshoe-shaped pattern in the ordination, which is often indicative of a gradient structure in the data. Notably, this pattern does not appear to be driven by differences in sequencing depth, which is interesting and warrants further exploration. Typically when I see a horsehoe it tells me there’s an underlying difference between samples in sequencing depth.
phy_filt_subset = subset_samples(phy_filt, Main.Factor=="post") %>%
prune_taxa(taxa_sums(.) > 0, .) %>%
prune_samples(sample_sums(.) > 0, .)
metadata = data.frame(sample_data(phy_filt_subset))
unifrac_dist <- UniFrac(phy_filt_subset, weighted = TRUE, normalized = TRUE, fast = TRUE)
ord <- ordinate(phy_filt_subset, method = "PCoA", distance = unifrac_dist)
df = data.frame(ord$vectors, sample_data(phy_filt_subset), Depth=sample_sums(phy_filt_subset))
df$Time_numeric = as.numeric(as.factor(df$Time_label))
p = ggplot(df, aes(Axis.1, Axis.2, color=Group, shape=Diet)) + geom_point()
p = ggplot(df, aes(Axis.1, Axis.2, color=Time_label, shape=Diet)) + geom_point() + facet_wrap(~Diet)
#note that diet is not significant
#day is significant
adonis_result <- adonis2(unifrac_dist ~ Time_label*Diet, data = metadata, by="term")
print(adonis_result)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = unifrac_dist ~ Time_label * Diet, data = metadata, by = "term")
## Df SumOfSqs R2 F Pr(>F)
## Time_label 1 0.03019 0.02692 1.3343 0.240
## Diet 1 0.01087 0.00970 0.4806 0.533
## Time_label:Diet 1 0.08487 0.07567 3.7506 0.040 *
## Residual 44 0.99562 0.88772
## Total 47 1.12155 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis_result <- adonis2(
unifrac_dist ~ Time_label * Diet,
data = metadata,
permutations = 999, by="term",
strata = as.factor(metadata$Cage)
)
print(adonis_result)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: strata
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = unifrac_dist ~ Time_label * Diet, data = metadata, permutations = 999, by = "term", strata = as.factor(metadata$Cage))
## Df SumOfSqs R2 F Pr(>F)
## Time_label 1 0.03019 0.02692 1.3343 0.247
## Diet 1 0.01087 0.00970 0.4806 0.067 .
## Time_label:Diet 1 0.08487 0.07567 3.7506 0.032 *
## Residual 44 0.99562 0.88772
## Total 47 1.12155 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Now we want to do more hypothesis testing rather than explore the data. Towards this end, we will use an ordination that let’s us quantify the amount of variation in the data that the factors diet and time explain. We’ll assess how much variation is explained by diet, whether there’s a temporal shift in community structure, and whether diet and time interact in any interesting ways. Importantly, what we do here is model variation across diet and time, rather than looking at total variance
However we don’t control for cage effects. In this analysis, we see that the time effect and the time by diet interaction is significant.
# Step 1: Transform to relative abundance
ps_rel <- transform_sample_counts(phy_filt, function(x) x / sum(x))
# Step 2: Extract OTU matrix (samples as rows, taxa as columns)
otu_mat <- as(otu_table(ps_rel), "matrix")
if (taxa_are_rows(ps_rel)) otu_mat <- t(otu_mat)
# Step 3: Extract sample metadata and map cage IDs to cage names
meta <- as(sample_data(ps_rel), "data.frame")
id_to_cage <- c(
"82084" = "Cage1", "82085" = "Cage2", "82086" = "Cage3", "82087" = "Cage4",
"85061" = "Cage5", "85062" = "Cage6", "85063" = "Cage7", "85064" = "Cage8",
"104184" = "Cage9", "104185" = "Cage10", "104186" = "Cage11", "104187" = "Cage12"
)
meta$Cage.new <- plyr::revalue(as.factor(meta$Cage), id_to_cage)
# Step 4: Match samples between OTU matrix and metadata
common_samples <- intersect(rownames(otu_mat), rownames(meta))
otu_mat_clean <- otu_mat[common_samples, ]
meta_clean <- meta[common_samples, ]
meta_clean$SampleID <- rownames(meta_clean)
# Step 5: Remove samples with missing Diet or Day
meta_clean <- meta_clean %>% filter(!is.na(Diet), !is.na(Day))
otu_mat_clean <- otu_mat_clean[rownames(meta_clean), ]
# Step 6: Compute Bray–Curtis distance matrix
bray_dist_clean <- vegdist(otu_mat_clean, method = "bray")
# Step 7: Run db-RDA (constrained ordination)
meta_clean$Group <- relevel(factor(meta_clean$Group), ref = "G1")
db_rda <- capscale(bray_dist_clean ~ Time_label * Diet, data = meta_clean, comm = otu_mat_clean)
# Extract eigenvalues and percent variance explained
eig_vals <- eigenvals(db_rda)
prop_explained <- round(100 * (eig_vals / sum(eig_vals)), 2)
# Step 8: Summarize and test db-RDA model
#summary(db_rda)
anova.cca(db_rda, permutations = 999, by = "term")
## Permutation test for capscale under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## Model: capscale(formula = bray_dist_clean ~ Time_label * Diet, data = meta_clean, comm = otu_mat_clean)
## Df SumOfSqs F Pr(>F)
## Time_label 2 0.6402 6.1624 0.001 ***
## Diet 1 0.0548 1.0544 0.321
## Time_label:Diet 2 0.1854 1.7845 0.064 .
## Residual 66 3.4282
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Extract taxon scores
taxon_scores <- scores(db_rda, display = "species")
# Step 9: Extract sample scores and join metadata for plotting
sample_scores <- scores(db_rda, display = "sites") %>%
as.data.frame() %>%
rownames_to_column("SampleID")
plot_df <- left_join(sample_scores, meta_clean, by = "SampleID")
# Step 10: Define factor for pre/post groups
plot_df$myfactor <- ifelse(plot_df$Group %in% c("G1", "G2"), "pre", "post")
plot_df$Time_numeric <- as.numeric(as.factor(plot_df$Time_label))
# Step 11: PCoA scatterplot colored by pre/post and shaped by Diet
dba_plot <- ggplot(plot_df, aes(x = CAP1, y = CAP2, color = myfactor, shape = Diet)) +
geom_point(alpha = 0.8, size = 2) +
labs(
x = paste0("CAP1: ", prop_explained[1], " %"),
y = paste0("CAP2: ", prop_explained[2], " %"),
color = "Group"
) +
theme_minimal()
# Step 12: Boxplots comparing groups with stats
# Helper function for group comparisons
plot_group_boxplot <- function(data, group_var, y_var = "CAP1", facet_var = NULL, title = NULL) {
p <- ggplot(data, aes_string(x = group_var, y = y_var)) +
geom_boxplot() +
theme_minimal() +
stat_compare_means()
if (!is.null(facet_var)) {
p <- p + facet_wrap(as.formula(paste("~", facet_var)))
}
if (!is.null(title)) {
p <- p + ggtitle(title)
}
p
}
# Plot G1 vs G2
plot_g1g2 <- plot_group_boxplot(subset(plot_df, Group %in% c("G1", "G2")), "Group")
# Plot G3 vs G4
plot_g3g4 <- plot_group_boxplot(subset(plot_df, Group %in% c("G3", "G4")), "Group")
# Plot Day -1 vs 10 with facet by Diet
plot_day <- plot_group_boxplot(
subset(plot_df, Day %in% c("-1", "10")),
group_var = "Day",
facet_var = "Diet",
title = "CAP1 by Day and Diet"
)
# You can now print or save the plots:
# print(dba_plot)
# print(plot_g1g2)
# print(plot_g3g4)
# print(plot_day)
let’s identify taxa that are driving the segregation of samples along axis 1 and 2
# Extract species scores from db-RDA
taxon_scores <- scores(db_rda, display = "species")
# Convert to data frame and clean
taxon_df <- as.data.frame(taxon_scores) %>%
tibble::rownames_to_column(var = "Taxon") %>%
mutate(across(where(is.list), ~ as.numeric(unlist(.)))) %>%
mutate(across(starts_with("CAP"), as.numeric))
# Extract taxonomy table from phyloseq object
tax_table_df <- as.data.frame(tax_table(phy_filt)) %>%
tibble::rownames_to_column(var = "Taxon")
# Merge with taxon_df to get genus names
taxon_annotated <- left_join(taxon_df, tax_table_df, by = "Taxon")
# Replace missing genus with "Unclassified"
taxon_annotated$Genus <- ifelse(is.na(taxon_annotated$Genus), "Unclassified", taxon_annotated$Genus)
# ---- Plot for CAP1 ----
top_taxa_CAP1 <- taxon_annotated %>%
dplyr::arrange(desc(abs(CAP1))) %>%
dplyr::slice(1:20)
db1 = ggplot(top_taxa_CAP1, aes(x = reorder(Genus, CAP1), y = CAP1)) +
geom_bar(stat = "identity", fill = "darkorange") +
coord_flip() +
labs(title = "Top 20 Taxa Contributing to db-RDA Axis 1",
x = "Genus",
y = "CAP1 Score") +
theme_minimal()+
theme(axis.text.y = element_text(face = "italic"))
# ---- Plot for CAP2 ----
top_taxa_CAP2 <- taxon_annotated %>%
dplyr::arrange(desc(abs(CAP2))) %>%
dplyr::slice(1:20)
db2 = ggplot(top_taxa_CAP2, aes(x = reorder(Genus, CAP2), y = CAP2)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Top 20 Taxa Contributing to db-RDA Axis 2",
x = "Genus",
y = "CAP2 Score") +
theme_minimal()+
theme(axis.text.y = element_text(face = "italic"))
grid.arrange(db1, db2, ncol=2)
SFig2 = plot_grid(db1, db2, ncol=2, labels = c("a", "b"))
ggsave(SFig1, file="~/Desktop/github/perez_diet/SFig2.pdf", device = pdf, width = 12, height = 8)
let’s plot relative abundance of taxa associated with CAP1 and CAP 2 plot relative abundance of significant taxa we write a small function to plot the genera individually because otherwise ggsig won’t work
# Subset to Lactobacillus and Bacteroides
ps_subset <- subset_taxa(phy_filt, Genus %in% c("Lactobacillus", "Bacteroides",
"Barnesiella", "Turicibacter",
"Prevotella", "Clostridium XlVa"))
# Transform to relative abundance
ps_rel <- transform_sample_counts(ps_subset, function(x) x / sum(x))
# Melt to long format for ggplot
ps_melt <- psmelt(ps_rel)
# Optional: clean up Genus names if needed
ps_melt$Genus <- as.character(ps_melt$Genus)
ps_melt$Genus[is.na(ps_melt$Genus)] <- "Unclassified"
ps_melt$Time_label = factor(ps_melt$Time_label, levels=c("T1", "T2", "T3"))
#subset on taxa
Turicibacter = subset(ps_melt, Genus=="Turicibacter")
Bacteroides = subset(ps_melt, Genus=="Bacteroides")
Lactobacillus = subset(ps_melt, Genus=="Lactobacillus")
Barnesiella = subset(ps_melt, Genus=="Barnesiella")
Prevotella = subset(ps_melt, Genus=="Prevotella")
Clostridium = subset(ps_melt, Genus=="Clostridium XlVa")
mycolorpalette =c(
"Lactobacillus" = "#aec7e8",
"Bacteroides" = "#2ca02c",
"Barnesiella" = "#d62728",
"Turicibacter" = "#c49c94",
"Prevotella" = "#e377c2",
"Clostridium XlVa" = "#ff9896"
)
#plot by genus
plot_my_taxa <- function(df, species) {
# Filter for the target genus
df_sub <- df %>% filter(Genus == species)
# Wilcoxon pairwise tests within each Diet, adjust p-values using BH
pval_df <- df_sub %>%
group_by(Diet) %>%
pairwise_wilcox_test(
Abundance ~ Time_label,
p.adjust.method = "BH",
comparisons = mycomparisons
) %>%
ungroup() %>%
add_significance("p.adj") %>%
add_y_position(step.increase = 0.05) # automatic y-positioning
# Plot
ggplot(df_sub, aes(x = Time_label, y = Abundance, fill = Genus)) +
geom_boxplot(position = position_dodge(width = 0.8), outlier.shape = NA) +
geom_point(position = position_jitterdodge(jitter.width = 0.15, dodge.width = 0.8)) +
facet_wrap(~Diet, scales = "free_x") +
scale_fill_manual(values = mycolorpalette) +
theme_minimal() +
theme(
legend.position = "none",
plot.title = element_text(face = "italic")
) +
xlab("") +
ylab("Relative Abundance") +
ggtitle(species) +
stat_pvalue_manual(
pval_df,
label = "p.adj.signif",
tip.length = 0.01,
size = 3
)
}
fig1d1 <- plot_my_taxa(Lactobacillus, species="Lactobacillus") + ylim(0, 0.9)
fig1d2 <- plot_my_taxa(Bacteroides, species="Bacteroides")+ ylim(0, 0.9)
fig1d3 <- plot_my_taxa(Barnesiella, species="Barnesiella")+ ylim(0, 0.9)
fig1d4 <- plot_my_taxa(Turicibacter, species="Turicibacter") + ylim(0, 0.4)
fig1d5 <- plot_my_taxa(Clostridium, species="Clostridium XlVa")+ ylim(0, 0.4)
fig1d6 <- plot_my_taxa(Prevotella, species="Prevotella")+ ylim(0, 0.4)
fig1D <- plot_grid(fig1d1, fig1d2, fig1d3, fig1d4, fig1d5, fig1d6,
ncol = 2, align = "hv")
fig1D
save figure 1d
ggsave(fig1D, file="~/Desktop/github/perez_diet/fig1D.pdf", device="pdf", height = 12, width = 7)
ggsave(fig1D, file="~/Desktop/github/perez_diet/fig1D.eps", device="eps", height = 12, width = 7)
plot_aldex_by_time <- function(phy_obj, time_point) {
# Filter samples by Time_label using prune_samples
samples_to_keep <- sample_names(phy_obj)[sample_data(phy_obj)$Time_label == time_point]
if(length(samples_to_keep) == 0) {
stop(paste("No samples found for Time_label =", time_point))
}
phy_sub <- prune_samples(samples_to_keep, phy_obj)
# Extract OTU counts and conditions
otu_mat <- as.data.frame(otu_table(phy_sub))
conds <- sample_data(phy_sub)$Diet
# Run ALDEx2 test
ald <- aldex(
t(otu_mat),
conds,
test = "t",
effect = TRUE,
denom = "all"
)
tax_df <- as.data.frame(tax_table(phy_sub)) %>% rownames_to_column("OTU")
ald_df <- as.data.frame(ald) %>%
rownames_to_column(var = "OTU") %>%
left_join(
dplyr::select(tax_df, OTU, Genus),
by = "OTU"
) %>%
mutate(
signif = wi.eBH < 0.05,
x = rab.all,
y = diff.btw
)
top_hits <- ald_df %>%
filter(signif) %>%
slice_max(order_by = abs(y), n = 10)
p <- ggplot(ald_df, aes(x = x, y = y)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
geom_point(aes(color = signif), alpha = 0.6, size = 1.8) +
scale_color_manual(values = c("grey70", "firebrick")) +
geom_text_repel(
data = top_hits,
aes(label = Genus),
size = 3,
box.padding = 0.3,
point.padding = 0.2,
max.overlaps = 15
) +
labs(
title = paste("ALDEx2 Diet Comparison at", time_point),
x = "Median CLR Abundance",
y = paste("Median CLR Difference (HOA vs control at", time_point, ")"),
color = "Adj. p < 0.05"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "right")
return(p)
}
# Plot ALDEx2 results for each Time point; but only display T2 cause it's the only one with a significant hit
p_T1 <- plot_aldex_by_time(phy_filt, "T1")
p_T2 <- plot_aldex_by_time(phy_filt, "T2")
p_T3 <- plot_aldex_by_time(phy_filt, "T3")
p_T2
let’s plot the relative abundnace of Acetatifactor
phy_filt_ra = transform_sample_counts(phy_filt, function(x) x/sum(x))
phy_acetatifactor <- subset_taxa(phy_filt_ra, Genus == "Acetatifactor")
tax.count <- data.frame(sample_data(phy_acetatifactor), otu_table(phy_acetatifactor)) %>%
reshape2::melt(., colnames(sample_data(phy_acetatifactor)))
#plot
ggplot(tax.count, aes(x = Diet, y = round(100*value, 2), fill = Diet)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, size = 1.5, alpha = 0.6) +
facet_wrap(~ Time_label, scales = "free_x") +
labs(
title = "Relative Abundance of *Acetatifactor* by Diet",
y = "Relative Abundance",
x = "Diet"
) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
# Compose left side: fig1A (panel a), fig1B (panel b)
fig1_top <- ggdraw() +
# Panel a: left column
draw_plot(gig1B, x = 0, y = 0, width = 0.5, height = 1) +
draw_plot_label("B", x = 0.01, y = 0.98, size = 14, fontface = "bold") +
# Panel b: right column
draw_plot(Fig1C, x = 0.5, y = 0, width = 0.5, height = 1) +
draw_plot_label("C", x = 0.51, y = 0.98, size = 14, fontface = "bold")
fig1_bottom <- ggdraw() +
# Panel c: left column
draw_plot(fig1D, x = 0, y = 0, width = 0.5, height = 1) +
draw_plot_label("D", x = 0.01, y = 0.98, size = 14, fontface = "bold") +
# Panel d: right column
draw_plot(fig1E, x = 0.5, y = 0, width = 0.5, height = 1) +
draw_plot_label("d", x = 0.51, y = 0.98, size = 14, fontface = "bold")
# Combine the two columns
Fig1 <- plot_grid(
fig1_top,
fig1_bottom,
ncol = 1,
rel_heights = c(0.5, 1) # top is half the height of bottom
)
# Print
print(Fig1)
ggsave(Fig1, file="~/Desktop/github/perez_diet/Figure1.pdf", width = 11, height = 12)